./../SFARI_genes_01-15-2019.csv of SFARI genes information downloaded from https://gene.sfari.org/database/gene-scoring/. Downloaded version 01-15-19.
SFARI_scraping/genes/gene_name.csv file for each of the SFARI genes with the information of all the reports related to it obtained using the SFARI_scraper.py script and scrapy package.
MeSH_terms_by_paper.json file with all the MeSH terms associated to each of the articles from the SFARI_scraper output files. Retrieved from the NCBI e-utils api using the get_articles_metadata.R script and reutils package. Downloaded on 24/05/19.
Cluster membership for each MeSH term
### GET LIST OF GENES
list_gene_files = list.files('./../SFARI_scraping/genes/')
# GET LIST OF PUBMED IDS
get_pubmed_IDs = function(){
all_pubmed_ids = c()
for(gene_file in list_gene_files){
file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
pubmed_ids = unique(file$pubmed.ID)
all_pubmed_ids = unique(c(all_pubmed_ids, pubmed_ids))
}
all_pubmed_ids = as.character(all_pubmed_ids)
return(all_pubmed_ids)
}
all_pubmed_ids = get_pubmed_IDs()
# CREATE GENE-ARTICLE (SPARSE) MATRIX
SFARI_genes_info = read.csv('./../../Data/SFARI_genes_01-15-2019.csv') %>% mutate('ID'=gene.symbol)
create_gene_pmID_mat = function(){
# Create empty dataframe
gene_pmID_mat = data.frame(matrix(0, nrow=length(list_gene_files), ncol=length(all_pubmed_ids)))
rownames(gene_pmID_mat) = sapply(list_gene_files, function(x) gsub('.csv', '', x))
colnames(gene_pmID_mat) = all_pubmed_ids
# Fillout dataframe
for(gene_file in list_gene_files){
file = read.delim(paste0('./../SFARI_scraping/genes/', gene_file))
pubmed_ids = as.character(unique(file$pubmed.ID))
gene_pmID_mat[gsub('.csv', '', gene_file), pubmed_ids] = 1
}
sparse_gene_pmID_mat = Matrix(as.matrix(gene_pmID_mat), sparse=TRUE)
return(sparse_gene_pmID_mat)
}
gene_pmID_mat = create_gene_pmID_mat()
print(paste0('Genes: ', nrow(gene_pmID_mat), ' Articles: ', ncol(gene_pmID_mat)))
## [1] "Genes: 1053 Articles: 3709"
rm(list_gene_files, all_pubmed_ids)
article_module_mat = read.csv('./../Data/article_module_matrix_wo_qualifiers.csv')
rownames(article_module_mat) = article_module_mat$X
article_module_mat = Matrix(as.matrix(article_module_mat[,-1]), sparse=TRUE)
print(paste0('Articles: ', nrow(article_module_mat), ' Modules: ', ncol(article_module_mat)))
## [1] "Articles: 3345 Modules: 15"
gene_pmID_mat = gene_pmID_mat[,colnames(gene_pmID_mat) %in% rownames(article_module_mat)]
# Check that ordering is the same in both matrices
if(!all(colnames(gene_pmID_mat) == rownames(article_module_mat))){
print("Article ordering doesn't match")
}
gene_module_mat = gene_pmID_mat %*% article_module_mat
print(paste0('Genes: ', nrow(gene_module_mat), ' Modules: ', ncol(gene_module_mat)))
## [1] "Genes: 1053 Modules: 15"
to_keep = rowSums(gene_module_mat)>0
print(paste0('Removing ', sum(!to_keep), ' genes'))
## [1] "Removing 14 genes"
SFARI score distribution of removed genes:
SFARI_genes_info$gene.score[!SFARI_genes_info$ID %in% rownames(gene_module_mat)[to_keep]] %>% table(useNA='ifany')
## .
## 3 4 5
## 3 6 5
# Remove genes
gene_module_mat = gene_module_mat[to_keep,]
rm(to_keep)
# Remove functions to preprocess data
rm(create_gene_pmID_mat, get_pubmed_IDs)
Module 7 is the most related module by far
genes_by_module = data.frame('ID'=colnames(gene_module_mat), 'n_genes'=colSums(gene_module_mat))
ggplotly(genes_by_module %>% ggplot(aes(ID, n_genes, fill=sqrt(n_genes))) + geom_bar(stat='identity') + scale_y_sqrt() +
ggtitle('No. genes related to each module') + xlab('Modules') + ylab('Related genes') +
theme_minimal() + theme(legend.position = 'none'))
rm(genes_by_module)
The higher the SFARI score, the bigger the module membership (makes sense because they have more papers and more MeSH terms). The same relation exists between syndromic and non-syndromic genes.
mesh_terms_by_gene = data.frame('ID'=rownames(gene_module_mat), 'n_Modules'=rowSums(gene_module_mat)) %>%
left_join(SFARI_genes_info, by='ID') %>% dplyr::select(ID, n_Modules, gene.score, syndromic) %>%
mutate(gene.score = as.factor(gene.score), syndromic=as.factor(as.logical(syndromic)))
p1 = ggplotly(mesh_terms_by_gene %>% ggplot(aes(gene.score, n_Modules, fill=gene.score)) + geom_boxplot() +
xlab('Gene Score') + ylab('No. Modules') + theme_minimal() + theme(legend.position='none') +
scale_fill_manual(values=SFARI_colour_hue()) + scale_color_manual(values=SFARI_colour_hue()))
p2 = ggplotly(mesh_terms_by_gene %>% ggplot(aes(syndromic, n_Modules, fill=syndromic)) + geom_boxplot() +
ggtitle('Module membership relation to each gene by score (left) and syndromic tag (right)') +
xlab('Syndromic') + ylab('No. Modules') + theme_minimal() + theme(legend.position='none'))
subplot(p1, p2, nrows=1, widths=c(0.7,0.3))
rm(p1, p2)
Since scores 5 and 6 have no syndromic genes, this tag can be a confounder, but even when separating the genes both by score and syndromic tag, the relation persists
ggplotly(mesh_terms_by_gene %>% ggplot(aes(gene.score, n_Modules, fill=gene.score)) + geom_boxplot() +
ggtitle('Modules membership relation to each gene by score and syndromic tag') +
xlab('Gene Score') + ylab('Module Memberships') + facet_wrap(~syndromic) +
scale_fill_manual(values=SFARI_colour_hue()) + scale_color_manual(values=SFARI_colour_hue()) +
theme_minimal() + theme(legend.position='none'))
rm(mesh_terms_by_gene)
On average, each node is connected to half of the nodes -> strongly connected Network
# EDGES
gene_gene_mat = gene_module_mat %*% t(gene_module_mat)
gene_names = rownames(gene_module_mat)
edges = summary(gene_gene_mat) %>% rename('from'=i, 'to'=j, 'count'=x) %>%
mutate(from = gene_names[from], to = gene_names[to]) %>%
filter(from!=to)
# Convert count to fraction (multiplying by 100 to make the numbers bigger)
gene_module_mat_rowsums = rowSums(gene_module_mat)
names(gene_module_mat_rowsums) = rownames(gene_gene_mat)
edges = edges %>% mutate('weight'=count/(gene_module_mat_rowsums[from]+gene_module_mat_rowsums[to]))
# NODES
nodes = SFARI_genes_info %>% dplyr::select(gene.symbol, gene.score, syndromic, number.of.reports) %>%
mutate(id = gene.symbol, title=gene.symbol, value=number.of.reports,
shape=ifelse(syndromic==0, 'circle','square'),
color_score=case_when(gene.score==1~'#FF7631', gene.score==2~'#FFB100',
gene.score==3~'#E8E328', gene.score==4~'#8CC83F',
gene.score==5~'#62CCA6', gene.score==6~'#59B9C9',
is.na(gene.score) ~ '#b3b3b3')) %>%
mutate('color'=color_score) %>% filter(!duplicated(id)) %>%
filter(gene.symbol %in% unique(c(edges$from, edges$to)))
# Details:
print(paste0('Nodes: ', nrow(nodes),' Edges: ', nrow(edges)/2))
## [1] "Nodes: 1039 Edges: 538919"
rm(gene_module_mat_rowsums, gene_names)
The higher the SFARI score, the higher the eigencentrality of the genes in the Network. Same with syndromic tag
graph = graph_from_data_frame(edges, vertices=nodes, directed=FALSE)
graph = graph %>% simplify(edge.attr.comb='first')
eigen_centrality_output = eigen_centrality(graph)
eigencentr = as.numeric(eigen_centrality_output$vector)
graph = graph %>% set_vertex_attr('eigencentrality', value=eigencentr)
plot_data = data.frame('gene.score'= as.factor(vertex_attr(graph, 'gene.score')),
'Syndromic'= as.logical(vertex_attr(graph, 'syndromic')),
'Eigencentrality'= vertex_attr(graph, 'eigencentrality'))
correlation_scr = cor(nodes$gene.score[!is.na(nodes$gene.score)], eigencentr[!is.na(nodes$gene.score)])
correlation_syn = cor(as.numeric(plot_data$Syndromic), plot_data$Eigencentrality)
p1 = ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none'))
p2 = ggplotly(plot_data %>% ggplot(aes(Syndromic, Eigencentrality, fill=Syndromic)) + geom_boxplot() +
theme_minimal() + theme(legend.position='none') +
ggtitle(paste0('Correlation = ', round(correlation_scr, 2), ' (left) and ', round(correlation_syn,2), ' (right)')))
subplot(p1, p2, nrows=1, widths=c(0.7, 0.3))
rm(correlation_scr, correlation_syn, p1, p2)
When separating the SFARI scores by syndromic tag, the relation between eigencentrality and score persists
corr_non_syn = cor(nodes$gene.score[!is.na(nodes$gene.score) & nodes$syndromic==0],
eigencentr[!is.na(nodes$gene.score) & nodes$syndromic==0])
corr_syn = cor(nodes$gene.score[!is.na(nodes$gene.score) & nodes$syndromic==1],
eigencentr[!is.na(nodes$gene.score) & nodes$syndromic==1])
ggplotly(plot_data %>% ggplot(aes(gene.score, Eigencentrality, fill=gene.score)) + geom_boxplot() +
scale_fill_manual(values=SFARI_colour_hue()) + theme_minimal() + theme(legend.position='none') +
facet_wrap( ~ Syndromic, ncol=2) + ggtitle(paste0('Correlation = ', round(corr_non_syn, 4),
' (left) and ', round(corr_syn, 4), ' (right)')))
rm(plot_data, corr_non_syn, corr_syn, nodes, edges)
plot_data = data.frame('id'=graph %>% get.vertex.attribute('name'), 'eigencentrality'=eigencentr) %>%
top_n(25, wt=eigencentrality)
ggplotly(plot_data %>% ggplot(aes(reorder(id, eigencentrality), eigencentrality, fill=eigencentrality)) +
geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() +
ggtitle(paste0('Top 25 genes with the highest Network centrality')) +
xlab('') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
rm(plot_data)
Note: Network is too densely connected to plot (too heavy and not very informative)
comms_louvain = cluster_louvain(graph)
modules = comms_louvain %>% membership
color_pal = viridis_pal()(max(modules)) %>% substr(1,7)
graph = graph %>% set_vertex_attr('module', value=as.numeric(modules)) %>%
set_vertex_attr('color_modules', value=color_pal[as.numeric(modules)]) %>%
set_vertex_attr('color', value=color_pal[as.numeric(modules)])
comm_sizes = sizes(comms_louvain) %>% data.frame %>% rename(Module=Community.sizes, size=Freq)
ggplotly(comm_sizes %>% ggplot(aes(Module, size, fill=size)) + geom_bar(stat='identity') +
ggtitle('Number of genes in each module') + theme_minimal() + theme(legend.position = 'none'))
hm_data = table(vertex_attr(graph, 'gene.score'), vertex_attr(graph, 'module'), useNA='ifany') %>%
as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none',
key=FALSE, xlab='Module', ylab='SFARI score', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))
hm_data = table(vertex_attr(graph, 'syndromic'), vertex_attr(graph, 'module'), useNA='ifany') %>%
as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', Rowv=F, Colv=F, scale='row', trace='none',
key=FALSE, xlab='Module', ylab='Syndromic', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))
rm(comm_sizes)
Separating by syndromic tag the SFARI scores
hm_data = data.frame('syndromic_gene.score'=paste(vertex_attr(graph, 'syndromic'), '_',vertex_attr(graph, 'gene.score')),
'module'=vertex_attr(graph, 'module'))
hm_data = table(hm_data$syndromic_gene.score, hm_data$module) %>% as.data.frame.matrix %>% as.matrix
heatmap.2(hm_data, cellnote=hm_data, notecol='white', scale='row', trace='none',
key=FALSE, xlab='Module', ylab='Syndromic', col=colorRampPalette(brewer.pal(8, 'RdPu'))(25))
rm(color_pal, hm_data)
Most important genes for each module
module_info = tibble('module'=character(), 'top_term'=character(), size=integer())
for(i in names(sizes(comms_louvain))){
module_vertices = names(modules)[modules==i]
# Creat subgraph
module_graph = induced_subgraph(graph, module_vertices)
# Calculate eigencentrality
eigen_centrality_output = eigen_centrality(module_graph)
eigencentr = as.numeric(eigen_centrality_output$vector)
names(eigencentr) = module_vertices
module_info = module_info %>% add_row('module'=i, 'top_term'=names(eigencentr)[eigencentr==max(eigencentr)][1],
'size'=length(module_vertices))
# Plot eigencentrality of top 10 genes
plot_data = data.frame('gene'=names(eigencentr), 'eigencentrality'=eigencentr) %>%
top_n(10, wt=eigencentrality)
print(plot_data %>% ggplot(aes(reorder(gene, eigencentrality), eigencentrality, fill=eigencentrality)) +
geom_bar(stat='identity') + scale_fill_viridis_c() + coord_flip() +
ggtitle(paste0('Top genes for Module ', i)) +
xlab('Genes') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
}
rm(i, module_vertices, module_graph, eigen_centrality_output, eigencentr, plot_data, module_info)
MeSH term top modules interpretations (kind of):
Module 2: Neuroscience
Module 3: Gene sequencing
Module 7: TF/Proteins
Module 9: DNA
Module 10: TF/Proteins
Module 14: Proteins
Module 19: GTP-Binding/Proteins
Module 1: Neuroscience
Module 2: Proteins? Only Module 7 is in the top modules with description
Module 3: Same as Module 2
for(i in names(sizes(comms_louvain))){
module_genes = names(modules)[modules==i]
# Filter genes belonging to module and column with non-zero values
module_gene_module_mat = gene_module_mat[rownames(gene_module_mat) %in% module_genes,]
module_gene_module_mat = module_gene_module_mat[,colSums(module_gene_module_mat)>0]
# Do PCA and extract 1st PC
pca_module = prcomp(module_gene_module_mat, scale.=TRUE)
module_eigengene = pca_module$rotation[,'PC1']
names(module_eigengene) = colnames(module_gene_module_mat)
# Plot module's most relevant MeSH terms
plot_data = data.frame('module'=names(module_eigengene), 'eigencentrality'=module_eigengene) %>%
mutate('module'=factor(module, levels=c('X1','X2','X3','X4','X5','X6','X7','X8','X9','X10',
'X11','X12','X13','X14','X15'), ordered=TRUE))
print(plot_data %>% ggplot(aes(module, eigencentrality, fill=abs(eigencentrality))) +
geom_bar(position='identity', stat='identity') + scale_fill_viridis_c() + coord_flip() +
ggtitle(paste0('MeSH Term Module Membership for Module ', i)) +
xlab('MeSH Term Module') + ylab('Eigencentrality') + theme_minimal() + theme(legend.position='none'))
}
rm(i, modules, module_genes, module_gene_module_mat, pca_module, module_eigengene, plot_data)